//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//
//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
//  to support float instead of SC32.
//

//
// Description:
// Compute one radix-4 FFT stage for an N-point complex signal.
//
//


// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


// Import symbols required from other files
// (For example tables)




// Set debugging level
//DEBUG_ON    SETL {TRUE}



// Guarding implementation by the processor name




// Guarding implementation by the processor name


// Import symbols required from other files
// (For example tables)


// Input registers (x0-x4 on entry to the exported routines below).
//   pSrc        - base address the stage reads complex points from (ld2).
//   pDst        - base address the stage writes complex points to (st2).
//   pTwiddle    - base address the twiddle factors are read from (ld1).
//   pSubFFTNum  - address of the sub-FFT count; read on entry and
//                 overwritten with count/4 on exit.
//   pSubFFTSize - address of the sub-FFT size; read on entry and
//                 overwritten with size*4 on exit.
// Both in/out values are accessed with 64-bit ldr/str.

#define pSrc            x0
#define pDst            x1
#define pTwiddle        x2
#define	pSubFFTNum	x3
#define pSubFFTSize	x4	



//Output Registers


// Local scratch registers (x5-x15).
//   subFFTNum    - value loaded from [pSubFFTNum], then divided by 4.
//   subFFTSize   - value loaded from [pSubFFTSize], then multiplied by 4.
//   grpCount     - outer-loop counter; starts at 4*size and drops by 4.
//   pointStep    - byte distance between data[k] and data[k+1] in pSrc.
//   outPointStep - byte distance between consecutive outputs in pDst.
//   stepTwiddle  - byte stride between W1, W2 and W3; grows by pointStep
//                  at the start of every group.
//   setCount     - inner-loop counter; drops by 2 per iteration.
//   srcStep      - 2*pointStep - 16, moves pSrc to the next group.
//   setStep      - -3*pointStep, rewinds pSrc from data[3] to data[0].
//   dstStep      - 16 - 3*outPointStep, rewinds pDst to the next set.
//   twStep       - -3*stepTwiddle, rewinds pTwiddle after loading W3.
// grpCount32/pointStep32 are the 32-bit views of grpCount/pointStep and
// are only used as the operands of the smull that forms outPointStep.

#define subFFTNum       x5
#define subFFTSize      x6
#define grpCount        x7
#define grpCount32      w7
#define pointStep       x8
#define pointStep32     w8
#define outPointStep    x9
#define stepTwiddle     x10
#define setCount        x11
#define srcStep         x12
#define setStep         x13
#define dstStep         x14
#define twStep          x15

// NEON registers. Every alias is a 2 x 32-bit float view (.2s), so each
// butterfly handles two complex points at once.
//   dW1..dW3    - twiddle factors; lane 0 = real part, lane 1 = imaginary
//                 part (the "[wi | wr]" layout noted at the loads).
//   dXrN/dXiN   - real/imaginary parts of input data[N], de-interleaved
//                 by ld2.
//   dYrN/dYiN   - results of the first butterfly stage.
//   dZrN/dZiN   - twiddle products (N = 1..3) and, later, the final
//                 outputs that st2 re-interleaves into pDst.
// v8-v15 are used here (dXr2 .. dYi1); the exported routines pass d15 to
// M_START, presumably so that the callee-saved d8-d15 are preserved -
// TODO confirm against arm64COMM_s.h.

#define dW1     v0.2s
#define dW2     v1.2s
#define dW3     v2.2s

#define dXr0    v4.2s
#define dXi0    v5.2s
#define dXr1    v6.2s
#define dXi1    v7.2s
#define dXr2    v8.2s
#define dXi2    v9.2s
#define dXr3    v10.2s
#define dXi3    v11.2s
#define dYr0    v12.2s
#define dYi0    v13.2s
#define dYr1    v14.2s
#define dYi1    v15.2s
#define dYr2    v16.2s
#define dYi2    v17.2s
#define dYr3    v18.2s
#define dYi3    v19.2s
#define dZr0    v20.2s
#define dZi0    v21.2s
#define dZr1    v22.2s
#define dZi1    v23.2s
#define dZr2    v24.2s
#define dZi2    v25.2s
#define dZr3    v26.2s
#define dZi3    v27.2s

        // FFTSTAGE scaled, inverse, name
        //
        // Emits the body of one out-of-place radix-4 FFT stage on complex
        // float data.
        //
        // Macro arguments:
        //   scaled  - accepted but not referenced anywhere in the body.
        //   inverse - "TRUE" selects the conjugate twiddle multiply and the
        //             inverse output formulas; any other value selects the
        //             forward ones.
        //   name    - suffix appended to every label so that the macro can
        //             be expanded more than once in the same file.
        //
        // Register interface (see the aliases defined above):
        //   in     : pSrc, pDst, pTwiddle
        //   in/out : [pSubFFTNum]  is replaced by its value / 4
        //            [pSubFFTSize] is replaced by its value * 4
        //   pSrc, pDst and pTwiddle are advanced by post-indexed accesses;
        //   x5-x15 and the NEON aliases above are overwritten.
        //
        // Loop structure:
        //   radix4GrpLoop - one pass per group; grpCount counts down by 4.
        //   radix4SetLoop - one pass per pair of complex points in a group;
        //                   setCount counts down by 2. The loop is software
        //                   pipelined: the inputs of the next pass are
        //                   loaded while the current pass is computed.
        //
        // NOTE(review): rsb is not a native A64 instruction; it is presumably
        // a macro from arm64COMM_s.h that computes (imm - src), which is what
        // the existing "= -3*..." comments assume - TODO confirm.
        .macro FFTSTAGE scaled, inverse , name

        // No stack arguments are used.

        // Fetch the in/out stage parameters into work registers
        ldr     subFFTNum, [pSubFFTNum]
        ldr     subFFTSize, [pSubFFTSize]

        // Update the group count and the stage parameters right away:
        //   grpCount   = 4 * subFFTSize  (outer loop counter)
        //   subFFTNum  = subFFTNum / 4   (written back on exit)
        //   subFFTSize = 4 * subFFTSize  (written back on exit)

        LSL     grpCount,subFFTSize,#2
        LSR     subFFTNum,subFFTNum,#2
        MOV     subFFTSize,grpCount

        // The first group uses the twiddle at pTwiddle[0] for W1, W2 and W3
        ld1      {dW1},[pTwiddle]                    //[wi | wr]
        // One complex float point is 8 bytes.
        // pointStep is first formed as 2*subFFTNum, which is the value the
        // smull below needs; it is scaled to bytes (8*subFFTNum) afterwards.
        lsl     pointStep,subFFTNum, #1

        // outPointStep = grpCount * (2*subFFTNum): the byte stride between
        // the four outputs of one butterfly. Only the low 32 bits of both
        // operands take part in the (signed) multiply.

        MOV     stepTwiddle,#0
        ld1      {dW2},[pTwiddle]                    //[wi | wr]
        smull   outPointStep,grpCount32,pointStep32

        LSL     pointStep,pointStep,#2             // pointStep = 8*subFFTNum bytes

        ld1      {dW3},[pTwiddle]                  //[wi | wr]
        lsl     srcStep,pointStep, #1              // srcStep = 2*pointStep

        ADD     setStep,srcStep,pointStep          // setStep = 3*pointStep

        rsb     setStep,setStep,#0                 // setStep = - 3*pointStep
        SUB     srcStep,srcStep,#16                // srcStep = 2*pointStep-16

        lsl     dstStep,outPointStep, #1

        ADD     dstStep,dstStep,outPointStep       // dstStep = 3*outPointStep
        // dstStep = - 3*outPointStep+16
        rsb     dstStep,dstStep,#16


radix4GrpLoop\name :

        // Prime the pipeline with the four inputs of the first set of this
        // group; the scalar bookkeeping is interleaved with the loads.
        ld2     {dXr0,dXi0},[pSrc],pointStep       //  data[0]
        // The twiddle stride grows by pointStep for every group
        ADD      stepTwiddle,stepTwiddle,pointStep
        ld2     {dXr1,dXi1},[pSrc],pointStep       //  data[1]
        // Point pTwiddle at W1 of the next group; W1..W3 are reloaded from
        // here at the bottom of the group loop.
        ADD      pTwiddle,pTwiddle,stepTwiddle
        ld2     {dXr2,dXi2},[pSrc],pointStep       //  data[2]
        lsl      twStep,stepTwiddle, #2

        //  data[3] & rewind pSrc to data[0] of this set
        ld2     {dXr3,dXi3},[pSrc],setStep
        SUB      twStep,stepTwiddle,twStep         // twStep = -3*stepTwiddle

        // setCount = pointStep/8 = number of complex points per input run
        lsr      setCount,pointStep, #3

        // set pSrc to data[0] of the next set
        ADD     pSrc,pSrc,#16
        // increment to data[1] of the next set
        ADD     pSrc,pSrc,pointStep


        // Loop on the sets

radix4SetLoop\name :



        // Twiddle multiply: Zk = Xk * Wk (forward) or Xk * conj(Wk)
        // (inverse), for k = 1..3. X0 is used unmodified.
        // The loads of data[1] and data[2] for the next pass are slotted in
        // as soon as the current dXr1/dXi1 and dXr2/dXi2 are no longer read.
        .ifeqs  "\inverse", "TRUE"
            fmul   dZr1,dXr1,dW1[0]
            fmul   dZi1,dXi1,dW1[0]
            fmul   dZr2,dXr2,dW2[0]
            fmul   dZi2,dXi2,dW2[0]
            fmul   dZr3,dXr3,dW3[0]
            fmul   dZi3,dXi3,dW3[0]

            fmla   dZr1,dXi1,dW1[1]                // real part
            fmls   dZi1,dXr1,dW1[1]                // imag part

            //  data[1] for next iteration
            ld2     {dXr1,dXi1},[pSrc],pointStep

            fmla   dZr2,dXi2,dW2[1]                // real part
            fmls   dZi2,dXr2,dW2[1]                // imag part

            //  data[2] for next iteration
            ld2     {dXr2,dXi2},[pSrc],pointStep

            fmla   dZr3,dXi3,dW3[1]                // real part
            fmls   dZi3,dXr3,dW3[1]                // imag part
        .else
            fmul   dZr1,dXr1,dW1[0]
            fmul   dZi1,dXi1,dW1[0]
            fmul   dZr2,dXr2,dW2[0]
            fmul   dZi2,dXi2,dW2[0]
            fmul   dZr3,dXr3,dW3[0]
            fmul   dZi3,dXi3,dW3[0]

            fmls   dZr1,dXi1,dW1[1]                // real part
            fmla   dZi1,dXr1,dW1[1]                // imag part

            //  data[1] for next iteration
            ld2     {dXr1,dXi1},[pSrc],pointStep

            fmls   dZr2,dXi2,dW2[1]                // real part
            fmla   dZi2,dXr2,dW2[1]                // imag part

            //  data[2] for next iteration
            ld2     {dXr2,dXi2},[pSrc],pointStep

            fmls   dZr3,dXi3,dW3[1]                // real part
            fmla   dZi3,dXr3,dW3[1]                // imag part
        .endif

        //  data[3] for the next iteration & rewind pSrc to data[0].
        // But don't read on the very last iteration because that reads past
        // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
        // In that case only the pointer rewind is performed.
        cmp     grpCount, #4
        
        b.ne    skipUpdate\name
        cmp     setCount, #2
        b.ne    skipUpdate\name
        add     pSrc, pSrc, setStep
        // add does not touch the flags, so they still hold "equal" from the
        // compare above and this branch is always taken.
        beq     radix4SkipRead\name
skipUpdate\name:
        ld2     {dXr3,dXi3},[pSrc],setStep
radix4SkipRead\name:

        // The flags set here are consumed by the BGT at the bottom of the
        // set loop; nothing in between writes the flags.
        SUBS    setCount,setCount,#2

        // finish first stage of 4 point FFT
        //   Y0 = X0 + Z2
        //   Y2 = X0 - Z2
        fadd    dYr0,dXr0,dZr2
        fsub    dYr2,dXr0,dZr2
        fadd    dYi0,dXi0,dZi2
        fsub    dYi2,dXi0,dZi2

        //  data[0] for next iteration
        ld2     {dXr0,dXi0},[pSrc], #16
        //   Y1 = Z1 + Z3
        //   Y3 = Z1 - Z3
        fadd    dYr1,dZr1,dZr3
        fsub    dYr3,dZr1,dZr3
        fadd    dYi1,dZi1,dZi3
        fsub    dYi3,dZi1,dZi3

        // finish second stage of 4 point FFT
        // The four results are stored outPointStep bytes apart, in order:
        //   1st: Y2 - Y1
        //   2nd: Y0 + j*Y3 (forward)   Y0 - j*Y3 (inverse)
        //   3rd: Y2 + Y1
        //   4th: Y0 - j*Y3 (forward)   Y0 + j*Y3 (inverse)
        // NOTE(review): these signs give the textbook radix-4 outputs only
        // if Z1..Z3 are the negated twiddle products, i.e. the twiddle table
        // holds negated factors (presumably inherited from the SC32 file
        // this was derived from) - TODO confirm against the table generator.

        //   Z0 = Y2 - Y1
        fsub    dZr0,dYr2,dYr1
        fsub    dZi0,dYi2,dYi1

        .ifeqs  "\inverse", "TRUE"

            // Inverse: store order is Z0, Z3, Z2, Z1
            fadd    dZr3,dYr0,dYi3
            st2     {dZr0,dZi0},[pDst],outPointStep
            fsub    dZi3,dYi0,dYr3

            //   Z2 = Y2 + Y1
            fadd    dZr2,dYr2,dYr1
            fadd    dZi2,dYi2,dYi1

            st2     {dZr3,dZi3},[pDst],outPointStep

            fsub    dZr1,dYr0,dYi3
            st2     {dZr2,dZi2},[pDst],outPointStep
            fadd    dZi1,dYi0,dYr3

            // Last store also rewinds pDst to the next set (dstStep)
            st2     {dZr1,dZi1},[pDst],dstStep


        .else

            // Forward: store order is Z0, Z1, Z2, Z3
            fsub    dZr1,dYr0,dYi3
            st2     {dZr0,dZi0},[pDst],outPointStep
            fadd    dZi1,dYi0,dYr3

            //   Z2 = Y2 + Y1
            fadd    dZr2,dYr2,dYr1
            fadd    dZi2,dYi2,dYi1

            st2     {dZr1,dZi1},[pDst],outPointStep

            fadd    dZr3,dYr0,dYi3
            st2     {dZr2,dZi2},[pDst],outPointStep
            fsub    dZi3,dYi0,dYr3

            // Last store also rewinds pDst to the next set (dstStep)
            st2     {dZr3,dZi3},[pDst],dstStep


        .endif

        // increment to data[1] of the next set
        ADD     pSrc,pSrc,pointStep
        // Loop while setCount (decremented by the SUBS above) is > 0
        BGT     radix4SetLoop\name


        // Load W1, W2, W3 for the next group. They sit stepTwiddle bytes
        // apart; the final post-index (twStep = -3*stepTwiddle) rewinds
        // pTwiddle so that the ADD at the top of the group loop positions it
        // for the following group.
        ld1      {dW1},[pTwiddle],stepTwiddle    //[wi | wr]
        // subtract 4 since grpCount multiplied by 4
        SUBS    grpCount,grpCount,#4
        ld1      {dW2},[pTwiddle],stepTwiddle    //[wi | wr]
        // increment pSrc for the next grp
        ADD     pSrc,pSrc,srcStep
        ld1      {dW3},[pTwiddle],twStep         //[wi | wr]
        // ld1 and ADD leave the flags from the SUBS above untouched
        BGT     radix4GrpLoop\name

        // Write the updated stage parameters back for the caller
        str     subFFTNum, [pSubFFTNum]
        str     subFFTSize, [pSubFFTSize]

        .endm


        // armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace
        // Forward radix-4 stage: expands FFTSTAGE with inverse = "FALSE" and
        // the label suffix FWD.
        //   x0 = pSrc, x1 = pDst, x2 = pTwiddle,
        //   x3 = pSubFFTNum (in/out), x4 = pSubFFTSize (in/out)
        // M_START/M_END come from arm64COMM_s.h; the d15 argument presumably
        // asks them to preserve callee-saved d8-d15, which the stage
        // overwrites (v8-v15) - TODO confirm against that header.
        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
            FFTSTAGE "FALSE","FALSE",FWD
        M_END


        // armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace
        // Inverse radix-4 stage: expands FFTSTAGE with inverse = "TRUE" and
        // the label suffix INV.
        //   x0 = pSrc, x1 = pDst, x2 = pTwiddle,
        //   x3 = pSubFFTNum (in/out), x4 = pSubFFTSize (in/out)
        // M_START/M_END come from arm64COMM_s.h; the d15 argument presumably
        // asks them to preserve callee-saved d8-d15, which the stage
        // overwrites (v8-v15) - TODO confirm against that header.
        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
            FFTSTAGE "FALSE","TRUE",INV
        M_END


        .end
